# %% 利用额外找到的POI数据集预训练bert wwm
# 基于pytorch lightning进行改写
import json
import os
from tqdm import tqdm
from transformers import BertTokenizer, BertModel, LineByLineTextDataset
from pytorch_lightning import seed_everything



# Local checkpoint directory handed to `from_pretrained` for both the
# tokenizer and the model.
pretrain_model = "./chinese-roberta-wwm-ext"
# Directory that holds the additional POI corpus used for pre-training.
folder = "./extend_poi_dataset"

# %%
tokenizer = BertTokenizer.from_pretrained(pretrain_model)
model = BertModel.from_pretrained(pretrain_model)

# `tokenizer.model_max_length` can be a very large sentinel integer when the
# checkpoint's tokenizer config does not declare a maximum length, which would
# effectively disable truncation. Cap it with the number of position
# embeddings the model actually has so no encoded line can exceed what the
# model accepts.
block_size = min(tokenizer.model_max_length, model.config.max_position_embeddings)

# Build the corpus path from `folder` instead of repeating the directory
# literal, so changing `folder` is enough to point at another dataset.
dataset = LineByLineTextDataset(
    tokenizer,
    file_path=os.path.join(folder, "poi.json.txt"),
    block_size=block_size,
)
# %%
